Twitter data has recently become one of the favorite datasets for Natural Language Processing (NLP) researchers. In this assignment I analysed tweets that were collected during the airing of the 'Red Wedding' episode of Game of Thrones using Natural Language Processing.
Natural language processing (NLP) is a branch of artificial intelligence that helps computers understand, interpret and manipulate human language. Basic NLP tasks include tokenization and parsing, lemmatization/stemming, part-of-speech tagging, language detection and identification of semantic relationships.
import numpy as np
import pandas as pd
import re, string, unicodedata
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
pd.set_option('display.max_columns', None)
data = pd.read_csv('got_tweets.csv')
data.head(5)
!pip install emoji
!pip install num2words
from num2words import num2words
from nltk.tokenize import word_tokenize
!pip install --upgrade gensim
from gensim import corpora, models
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
!pip install wordcloud
from wordcloud import WordCloud
from os import path
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
data
Parsed all the hashtags out of the texts, then counted their overall occurrences and found 10 most common hashtags. Made a bar chart of these top 10 hashtags.
Firstly, using Regex (Regular Expressions), I selected hashtags and assigned them to the data['hashtag'] column. Calculated the length of the list / number of entries (this is with duplicate entries). Transformed the list of hashtags into a dataframe. Renamed the columns of the dataframe and initialized the punctuation string. Calculated the number of times each hashtag appears in the tweets. Ordered the dataframe with the highest counts at the top. Dropped the duplicate words and showed only the top ten in a bar chart.
# Extract every hashtag from each tweet with a regex and store them per row.
data['hashtag'] = data['text'].apply(lambda x: re.findall(r"(#\S+)", x.lower()))
data.head(5)

# Flatten the per-tweet hashtag lists into one list (duplicates kept).
hashtags = [word for sublist in data.hashtag for word in sublist]
#hashtags

# Total number of hashtag occurrences (with duplicates).
nb_hashtags = len(hashtags)
print("Total number of hashtags is " + str(nb_hashtags) + ".")

# Transform the list of hashtags into a dataframe.
hashtags_df = pd.DataFrame(hashtags)
hashtags_df.columns = ['hashtags']

# Strip punctuation from the hashtags.  Bug fix: `regex=True` is now passed
# explicitly — pandas >= 2.0 treats the pattern literally by default, which
# would silently stop removing punctuation.
hashtags_df['hashtags'] = hashtags_df.hashtags.str.replace(r'[^\w\s]', '', regex=True)

# Count how many times each hashtag appears in the tweets.
hashtags_df['count'] = hashtags_df.groupby('hashtags')['hashtags'].transform('count')
hashtags_df['count']

# Most frequent hashtags first, then keep one row per distinct hashtag.
hashtags_df = hashtags_df.sort_values(by=['count'], ascending=False)
hashtags_df = hashtags_df.drop_duplicates(['hashtags'], keep='first')
hashtags_df[:10]

# Bar chart of the ten most common hashtags.
ax = hashtags_df[:10].plot.bar(x='hashtags', y='count', rot=90)
ax.set_ylabel('count of hashtags')

noduplicate_hashtags = len(hashtags_df)
print("Total number of single hashtags is " + str(noduplicate_hashtags) + ".")
Tokenize the text of the tweets, and gather the 'real' words for each tweet.
By 'real' words, there should be:
Count word occurrences and make a histogram of the occurrences. What are the top words? Are they what you expected?
What crazy words did you get? Explain possible approaches, with which you could throw out this kind of junk text as well.
Firstly I removed emojis. Then I removed URLs and mentions using the preprocessor library, removed punctuation, tokenized the words, converted them to lowercase, and removed stopwords and numeric characters. I took all the tokens (separated words) and appended them to a list, giving one giant list of all words written in the tweets. I calculated the length of the list / number of entries, turned the list into a dataframe, calculated the number of times each word appears in the tweets, ordered the dataframe with the highest counts at the top, added the frequency of occurrence of the words, dropped the duplicate words, and showed only the top ten words. The top ten words are 'game', 'thrones', 'episode', 'last', 'im', 'de', 'fuck', 'wedding', 'watching', 'still'. I think they are very related to Game of Thrones, as I expected.
#!pip install emoji
# Strip emojis by round-tripping through ASCII: characters that cannot be
# encoded (all emojis) are dropped by 'ignore', then the bytes are decoded
# back into plain text, leaving the data emoji-free.
#import emoji
data = data.astype(str).apply(
    lambda col: col.str.encode('ascii', 'ignore').str.decode('ascii'))

STOP_WORDS = stopwords.words('english')

import preprocessor as p  # tweet-preprocessor: strips URLs and @mentions


def preprocess_tweet(row):
    """Return the tweet text of *row* with URLs and mentions removed."""
    return p.clean(row['text'])


# Store the cleaned tweets as a new feature column 'textclean'.
data['textclean'] = data.apply(preprocess_tweet, axis=1)
data['textclean']
#After removal of URLs, Mentions
def remove_punct(text):
    """Return *text* with every ASCII punctuation character removed."""
    return text.translate(str.maketrans('', '', string.punctuation))
# Remove punctuation from the URL/mention-cleaned tweets.
data['textcleanpunct'] = data['textclean'].apply(remove_punct)
# (a column can be dropped with: data.drop('text_clean', axis=1, inplace=True))
data['textcleanpunct']
# Function to tokenize a string into words.
def tokenize(text):
    """Split *text* into tokens on runs of non-word characters.

    `\\W+` matches one or more characters that are NOT word characters
    (letters, digits, underscore) — the original comment described it
    backwards.  A leading/trailing separator yields an empty-string token.
    The pattern is now a raw string to avoid the invalid-escape warning.
    """
    return re.split(r'\W+', text)
# Lower-case first (Python string comparisons are case-sensitive), then split
# each tweet into tokens.
data['textlower'] = data['textcleanpunct'].apply(lambda tweet: tokenize(tweet.lower()))
data['textlower']
stopword = nltk.corpus.stopwords.words('english')  # all English stopwords


def remove_stopwords(tokenized_list):
    """Keep only alphabetic tokens longer than one character that are not stopwords."""
    kept = []
    for token in tokenized_list:
        if token not in stopword and token.isalpha() and len(token) > 1:
            kept.append(token)
    return kept


data['removewtopwords'] = data['textlower'].apply(remove_stopwords)
data['removewtopwords']
# Flatten the per-tweet token lists into one giant list of all words
# written in the tweets.
tokens = []
for token_list in data.removewtopwords:
    tokens.extend(token_list)

# Number of word occurrences across all tweets.
nb_tokens = len(tokens)
print("The total number of words for all the tweets is " + str(nb_tokens) + ".")

# Work in a dataframe, which is easier to aggregate.
tokens_df = pd.DataFrame(tokens)
tokens_df.columns = ['words']

# Occurrences of each word across the tweets.
tokens_df['count'] = tokens_df.groupby('words')['words'].transform('count')

# Most frequent words first, then one row per distinct word.
tokens_df = tokens_df.sort_values(by=['count'], ascending=False)
tokens_df = tokens_df.drop_duplicates(['words'], keep='first')

# Clean-up: strip spaces from column names, drop NaNs and empty-string rows.
tokens_df.columns = tokens_df.columns.str.replace(' ', '')
tokens_df.dropna(inplace=True)
empty_rows = tokens_df[tokens_df['words'] == ''].index
tokens_df.drop(empty_rows, inplace=True)

# Bar chart of the ten most frequent words.
tokens_df[:10].plot.bar(x='words', y='count', rot=90);

# Relative frequency (%) of each word among all tokens.
tokens_df['frequency_totwords'] = (tokens_df['count'] / nb_tokens) * 100
tokens_df[:10]
tokens_df[:10]['words'].tolist()
tokens_df[:10].plot.hist(x='words', y='count');
#tokens
Extract the stopword list for the English language with the help of nltk. Download the standard Brown Corpus also from nltk, count the relative frequency of stopwords in both the Brown Corpus and the GoT tweets. Make a scatterplot of your results, try to explain possible similarities and deviations. What is the correlation in the stopword frequencies of the two datasets?
Firstly I downloaded the Brown corpus, then I found the stopwords in the Brown word list and added them to a list. I calculated the length of the list and the number of times each word appears, ordered the dataframe with the highest counts at the top, dropped the duplicate words, and showed only the top ten. Then I counted the frequency of stopwords in the Brown corpus and made a scatterplot of those frequencies. I followed the same steps for the GoT tweets. Brown corpus top-10 stopwords: 'the', 'of', 'and', 'to', 'a', 'in', 'that', 'is', 'was', 'for'. GoT tweets top-10 stopwords: 'of', 'the', 'i', 'to', 'a', 'that', 'in', 'just', 'was', 'what'.
nltk.download('brown')
from nltk.corpus import brown

# Materialize the Brown corpus as a word list (rebinds the module name).
#brown=brown.words()
brown = list(brown.words())
#brown

# Keep only the Brown tokens that are English stopwords.
stopwordbrown = nltk.corpus.stopwords.words('english')  # all English stopwords
browntext = [word for word in brown if word in stopwordbrown]
#browntext

# Total number of stopword occurrences in the Brown corpus.
# Bug fix: the message previously claimed these were tweet counts.
nb_brown = len(browntext)
print("The total number of stopwords in the Brown corpus is " + str(nb_brown) + ".")

# Turn the list into a dataframe.
browntext_df = pd.DataFrame(browntext)
browntext_df.columns = ['stops']

# Count how many times each stopword appears.
browntext_df['count'] = browntext_df.groupby('stops')['stops'].transform('count')

# Most frequent stopwords first, one row per stopword.
browntext_df = browntext_df.sort_values(by=['count'], ascending=False)
browntext_df = browntext_df.drop_duplicates(['stops'], keep='first')
browntext_df

browntext_df[:10].plot.bar(x='stops', y='count', rot=90);

# Relative frequency (%) of each stopword among Brown stopword occurrences.
# Bug fix: the original divided by `nb_tokens` (the GoT tweet token count),
# i.e. the wrong corpus size for Brown frequencies.
browntext_df['frequency_stopwords'] = (browntext_df['count'] / nb_brown) * 100
browntext_df[:10]
browntext_df[:10].plot.hist(x='stops', y='count');

brownstop_dftop10 = browntext_df[:10]
brownstop_dftop10['stops'].tolist()

import seaborn as sns
import matplotlib.pyplot as plt
sns.scatterplot(data=brownstop_dftop10, x="stops", y="frequency_stopwords")
# Stopword analysis for the GoT tweet dataset.
stopwordgot = nltk.corpus.stopwords.words('english')  # all English stopwords


def find_stopwords(stop_list):
    """Return the tokens of *stop_list* that are English stopwords."""
    return [word for word in stop_list if word in stopwordgot]


data['stopwordsall'] = data['textlower'].apply(find_stopwords)
gotstopword = data['stopwordsall'].values.tolist()
#gotstopword

# Flatten the per-tweet stopword lists into one list.
gotstop = []
for sublist in gotstopword:
    for word in sublist:
        gotstop.append(word)

# Total number of stopword occurrences in the tweets.
nb_stops = len(gotstop)
print("The total number of words for all the tweets is " + str(nb_stops) + ".")

# Turn the list into a dataframe -> easier to deal with!
gotstop_df = pd.DataFrame(gotstop)
gotstop_df.columns = ['gotstopwords']
gotstop_df

# Count how many times each stopword appears in the tweets.
gotstop_df['count'] = gotstop_df.groupby('gotstopwords')['gotstopwords'].transform('count')
gotstop_df

# Most frequent stopwords first, one row per stopword.
gotstop_df = gotstop_df.sort_values(by=['count'], ascending=False)
gotstop_df = gotstop_df.drop_duplicates(['gotstopwords'], keep='first')
gotstop_df

# Bar chart of the 10 most frequent stopwords.
gotstop_df[:10].plot.bar(x='gotstopwords', y='count', rot=90);

# Relative frequency (%) of each stopword among all GoT stopword occurrences.
# Bug fix: the original divided by `nb_tokens` (the NON-stopword token count);
# dividing by `nb_stops` mirrors the Brown-corpus computation, so the two
# frequency columns become directly comparable in the scatterplot.
gotstop_df['frequency_stopwords'] = (gotstop_df['count'] / nb_stops) * 100
gotstop_df[:10]
gotstop_df[:10].plot.hist(x='gotstopwords', y='count');

gotstop_dftop10 = gotstop_df[:10]
gotstop_dftop10

sns.scatterplot(data=gotstop_dftop10, x="gotstopwords", y="frequency_stopwords")
gotstop_dftop10 #brownstop_dftop10
brownstop_dftop10

x = gotstop_dftop10['gotstopwords']
x.tolist()
y = brownstop_dftop10['stops']
y.tolist()
A really common tool to visualize texts is a wordcloud. Find a suitable library and create a meaningful wordcloud of the GoT tweets (e.g. leave out punctuation, stopwords etc.)
I had deleted the punctuation and stopwords above. I rejoined meaningful stemmed words into a single string then created data['processed']. Generated a word cloud image importing libraries WordCloud, STOPWORDS
data['removewtopwords']  # punctuation and stopwords were already removed above


def rejoin_words(row):
    """Join a row's cleaned token list back into one space-separated string."""
    return " ".join(row['removewtopwords'])


data['processed'] = data.apply(rejoin_words, axis=1)
data['processed']

# NOTE(review): this rebinds the module-level name `stopwords` (the nltk
# corpus import) to the wordcloud STOPWORDS set; later cells access
# nltk.corpus.stopwords explicitly, so behavior is unaffected.
stopwords = set(STOPWORDS)

# Generate a word cloud image.  Bug fix: join the tweets with a space —
# the original "".join fused the last word of one tweet to the first word
# of the next, creating junk tokens in the cloud.
text = " ".join(word for word in data.processed)
wordcloud = WordCloud(stopwords=stopwords, background_color="black",
                      width=800, height=400).generate(text)

# Display the generated image with matplotlib.  Bug fix: create the figure
# BEFORE turning the axis off — the original plt.axis("off") acted on a
# throwaway implicit figure, so the displayed image kept its axes.
plt.figure(figsize=(40, 20))
plt.axis("off")
plt.tight_layout(pad=0)
plt.imshow(wordcloud, interpolation='bilinear')
plt.show()
Define a time window in which all tweets count as one document. Create the term-document matrix of the tweets for this time segmentation. Apply stemming and stopword filtering.
I defined a time-window dataset and drew a graph that shows the tweet counts over time. Then I created the term-document matrix of the tweets and applied stemming and stopword filtering.
# Build a (timestamp, tweet-count) table: one row per created_at value.
# Bug fix: .copy() prevents the SettingWithCopyWarning / silent no-op that
# can occur when assigning a new column to a slice of `data`.
data2 = data[['created_at', 'processed']].copy()
data2['freq'] = data2.groupby(by='created_at')['created_at'].transform('count')
data3 = data2.drop('processed', axis=1).drop_duplicates()
data3

# Index the counts by timestamp.
data3['created_at'] = pd.to_datetime(data3['created_at'])
dataset_n = data3.set_index('created_at')
dataset_n.index

title_font = {"family": "Cambria",
              "size": 15,
              "color": "black",
              "weight": "bold"}
plt.rcParams.update({'figure.figsize': (10, 6), 'figure.dpi': 120})

# Aggregate tweet counts by time of day and plot the hourly frequency.
by_time = dataset_n.groupby(dataset_n.index.time).sum()
hourly_ticks = 2 * 60 * 60 * np.arange(12)  # a tick every 2 hours
by_time.plot(xticks=hourly_ticks, style='--o', color='blue')
plt.title('Frequency Per Hour', fontdict=title_font)
plt.xlabel('hour')
plt.ylabel('frequency')
plt.grid(axis='x')
plt.show();
I created a term-document matrix for the tweets.
data['processed']
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

c = CountVectorizer()
# Term-document matrix of the first 10 tweets (densified for display).
dtf = c.fit_transform(data['processed'].head(10).tolist()).todense()
# Bug fix: `get_feature_names()` was removed in scikit-learn 1.2;
# `get_feature_names_out()` is the supported replacement.
feature_names = c.get_feature_names_out()
strfeature = ' '.join(feature_names)  # the vocabulary (stopwords already excluded)
matrix = pd.DataFrame(dtf, columns=feature_names)
matrix
matrix.sum(axis=1)  # words per document
# Stemming with the Porter stemmer.
# Bug fix: join tweets with a space so words at tweet boundaries don't fuse
# into junk tokens (the original used "".join).
text = " ".join(word for word in data.processed)
nltk.download('punkt')
porter = PorterStemmer()
from nltk.tokenize import sent_tokenize, word_tokenize


def stemSentence(text):
    """Return *text* with every word replaced by its Porter stem."""
    token_words = word_tokenize(text)
    stem_sentence = []
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)


x = stemSentence(text)  # stemming applied (stopwords were filtered earlier)
#print(x)
Apply a TF-IDF weighting scheme for the term-document matrix by hand (e.g. do not use a built-in vectorizer, but normalize by text length with a summation etc. numpy or pandas is strongly suggested). Then, choose a topic detection method such as LSI or LDA, and run it on your matrix. Try to interpret your results! Are your topics meaningful? Which topics are the most representative of your document?
In this part I applied a TF-IDF weighting scheme to the term-document matrix by hand. Then I chose LDA as the topic detection method. The highest-scoring topic using TF-IDF and LDA is:
Score: 0.7666631937026978
Topic: 0.016"game" + 0.016"thrones" + 0.015"cant" + 0.013"im" + 0.009"sad" + 0.009"ending" + 0.008"telling" + 0.008"watching" + 0.008"episode" + 0.008"traumatised" + 0.008"ive" + 0.008"show" + 0.008"time" + 0.007"books" + 0.007*"read
I understand from this that the episode ended very sadly and traumatically, and that many viewers were disappointed.
# --- TF-IDF by hand ---
# Term frequency: each row (document) normalized by its total word count.
matrix.sum(axis=1)
matrix4 = matrix.div(matrix.sum(axis=1), axis=0)  # TF

import math

# Document frequency: fraction of documents that contain each term.
matrix2 = matrix.astype(bool).mean()
matrix.astype(bool).sum()['de']  # e.g. in how many documents 'de' occurs

# Inverse document frequency.  Bug fix: the original used
# -log(1/(1+df)) = log(1+df), which GROWS with document frequency and so
# up-weighted the most common terms — the opposite of IDF's purpose.
# The standard form idf = log(1/df) = -log(df) down-weights common terms
# (df is in (0, 1] for every term in the vocabulary, so the log is defined).
matrix3 = matrix2.apply(lambda df_frac: -math.log(df_frac))

# TF-IDF = TF * IDF, column-wise.
matrix5 = matrix4.mul(matrix3, axis=1)
matrix5  # the TF-IDF-weighted term-document matrix
# Build a gensim dictionary from the cleaned per-tweet token lists.
Processeddata = data['removewtopwords']
Processeddata

dictionary = corpora.Dictionary(Processeddata)

# Peek at the first few (id, token) entries.
# Bug fix: `iteritems()` is not available on modern gensim Dictionary
# objects; `items()` is the supported mapping API.
count = 0
for k, v in dictionary.items():
    print(k, v)
    count += 1
    if count > 10:
        break

# Drop very rare (<15 docs) and very common (>50% of docs) tokens,
# keeping at most 100000 terms.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Bag-of-words representation for every document.
bow_corpus = [dictionary.doc2bow(doc) for doc in Processeddata]
bow_corpus[4310]

# Inspect one document's bag of words.
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0],
                                                     dictionary[bow_doc_4310[i][0]],
                                                     bow_doc_4310[i][1]))
TF-IDF
from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus and transform it.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Show the TF-IDF weights of the first document only.
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break
# LDA topic model on the raw bag-of-words counts.
lda_model = models.LdaMulticore(bow_corpus, num_topics=15, id2word=dictionary,
                                passes=5, iterations=50)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

# LDA topic model trained on the TF-IDF-weighted corpus.
lda_model_tfidf = models.LdaMulticore(corpus_tfidf, num_topics=15,
                                      id2word=dictionary, passes=5, iterations=50)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Processeddata[4310]

# Topic scores for document 4310 under the bag-of-words model.
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 15)))

# Topic scores for document 4310 under the TF-IDF model.
# Bug fix: a model trained on the TF-IDF corpus must be queried with the
# TF-IDF representation of the document, not its raw bag of words.
for index, score in sorted(lda_model_tfidf[corpus_tfidf[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 15)))
Write an own name parser for the tweets, and consider all names that you find in the dataset as a node of a graph. Add 1 to the weight of an edge if two names occur in the same tweet. With the help of networkx, draw the weighted network of names from the text. Try to find a simple clustering algorithm in networkx, cluster the names in the dataset. Print or visualize your results!
This episode caused severe disappointments in many viewers, because of the sudden death of too many of the favourite characters. Search for some sentiment analysis method, and create a timeline of sentiments based on the tweet texts. Do the sentiments on Twitter reflect the time of the worst scene?
Sentiment analysis (or opinion mining) is a natural language processing technique used to determine whether data is positive, negative or neutral. There are a lot of different ways in determining whether a sentiment of a sentence is considered as positive, negative, or neutral. For this analysis, I used a package named TextBlob to score each sentence spoken by every unique character on our dataset. Scores provided by TextBlob consist of two values which are polarity and subjectivity. Polarity score is between -1 to 1 which define the attitude as positive, negative, or neutral in a statement, while subjectivity score is between 0 to 1 referring to personal opinion, emotion, or judgement. However, I made use of polarity score to support this analysis. positive sentiment : polarity ≥ +0.5 negative sentiment : polarity ≤ -0.5 neutral sentiment : -0.5 < polarity < +0.5 Then I created a timeline for negative sentiments to reflect the time of the worst scene. I saw 1267 tweets are positive, 1832 tweets are negative and 25239 tweets are neutral
data
!pip install textblob #for installation
import textblob #to import
from textblob import TextBlob
data['polarity'] = data.apply(lambda x: TextBlob(x['processed']).sentiment.polarity, axis=1)
data['subjectivity'] = data.apply(lambda x: TextBlob(x['processed']).sentiment.subjectivity, axis=1)
positive_df = data.loc[data['polarity'] >= 0.5]
Overall_positive_sentiment = (len(positive_df)/len(data))
positive_df # I saw 1267 tweets are positive.
all_words = ' '.join([text for text in positive_df['processed']]) # I wanted to look positive words in positive dataset
from wordcloud import WordCloud
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# Negative tweets: polarity <= -0.5.
Negative = data.loc[data['polarity'] <= -0.5]
Negative  # 1832 tweets are negative

# Timeline of negative tweets.
# Bug fix: .copy() prevents the SettingWithCopyWarning / silent no-op that
# can occur when assigning a new column to a slice of `Negative`.
Negative2 = Negative[['created_at', 'processed']].copy()
Negative2['freq'] = Negative2.groupby(by='created_at')['created_at'].transform('count')
Negative2['created_at'] = pd.to_datetime(Negative2['created_at'])
dataset_n = Negative2.set_index('created_at')
dataset_n.index

title_font = {"family": "Cambria",
              "size": 15,
              "color": "black",
              "weight": "bold"}
plt.rcParams.update({'figure.figsize': (10, 6), 'figure.dpi': 120})

# Aggregate negative-tweet counts by time of day and plot them.
by_time = dataset_n.groupby(dataset_n.index.time).sum()
hourly_ticks = 2 * 60 * 60 * np.arange(12)  # a tick every 2 hours
by_time.plot(xticks=hourly_ticks, style='--o', color='brown')
plt.title('Frequency Per Hour for Negative Tweets', fontdict=title_font)
plt.xlabel('hour')
plt.ylabel('frequency')
plt.grid(axis='x')
plt.show();

# Word cloud of the words used in negative tweets.
all_words = ' '.join([text for text in Negative['processed']])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# Neutral tweets: -0.5 < polarity < +0.5, per the definition stated above.
# Bug fix: the original used inclusive bounds (>= -0.5, <= 0.5), so tweets
# with |polarity| exactly 0.5 were counted both here and in the
# positive/negative sets.
Neutral = data.loc[(data['polarity'] > -0.5) & (data['polarity'] < 0.5)]
Neutral  # roughly 25239 tweets are neutral

# Word cloud of the words used in neutral tweets.
all_words = ' '.join([text for text in Neutral['processed']])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()